Based on the analysis conducted in this report, the unemployment rate is forecasted to be 3.79% and total nonfarm payroll employment is expected to increase by 170,000 in April 2019. This report will explore various forecasting methods and explain the choices that were made in generating the above forecast estimates.

suppressMessages(library(fredr))
suppressMessages(library(fpp2))
suppressMessages(library(dynlm))
suppressMessages(library(quantreg))
suppressMessages(library(ggplot2))
suppressMessages(library(vars))
suppressMessages(library(tseries))
suppressMessages(library(forecast))
suppressMessages(library(glmnet))
suppressMessages(library(rstanarm))
suppressMessages(library(rstan))
suppressMessages(library(prophet))
suppressMessages(library(rpart)) 
suppressMessages(library(randomForest)) 
suppressMessages(library(xgboost)) 
suppressMessages(library(kernlab)) 
suppressMessages(library(caret)) 
suppressMessages(library(plyr)) 
suppressMessages(library(dplyr))
suppressMessages(library(knitr))
suppressMessages(library(kableExtra))
suppressMessages(library(tseries))
suppressMessages(library(rugarch)) 
suppressMessages(library(rstan))
suppressMessages(library(loo)) 
suppressMessages(library(bridgesampling))
suppressMessages(library(mgcv))
suppressMessages(library(gridExtra))

# Download the two target series from FRED.
# SECURITY NOTE(review): a live FRED API key is hard-coded and committed with
# this report. Prefer fredr_set_key(Sys.getenv("FRED_API_KEY")) and rotate
# this key, since anyone with the file can use it.
fredr_set_key("8782f247febb41f291821950cf9118b6")
UNRATE <- fredr(series_id = "UNRATE")
# units = "chg" requests month-over-month changes in the payroll level
# rather than the level itself.
PAYEMS <- fredr(series_id = "PAYEMS",units="chg")

# Convert the FRED downloads to monthly ts objects and plot the full history.
# NOTE(review): column index 3 assumes the fredr result layout puts the value
# in the third column - confirm against the installed fredr version, or use
# the column name instead of a positional index.
unrate = ts(UNRATE[,3], start = c(1948,1), frequency = 12)
autoplot(unrate) + labs(x = "Date", y = "Unemployment Rate (%)", title = "U.S Unemployment Rate (1948 - 2019)") + theme(legend.position = "none")

# payems is already a series of monthly changes (units = "chg" above).
payems = ts(PAYEMS[,3], start = c(1939,1), frequency = 12)
autoplot(payems) + labs(x = "Date", y = "Change in Employment (100s of thousands)", title = "Change in Total U.S Nonfarm Payroll Employment Level (1939 - 2019)") + theme(legend.position = "none") + theme(plot.title = element_text(size=12))

Exploring Various Forecasting Methods

Autoregressive Forecasts using Empirical Risk Minimization

# Unrate 

# Forecasting using AR(1-5) (Square Loss)
# Fit pure autoregressions of orders 1-5 with Arima() and print 4-step-ahead
# forecasts (Apr-Jul 2019) with 80%/95% intervals.
# NOTE(review): include.mean = FALSE forces a zero-mean AR on the raw
# unemployment rate, whose level is clearly positive; the near-unit-root ar1
# estimate (0.9990 below) partly compensates, but an intercept (or
# differencing) would be more conventional - confirm this was deliberate.
unrate_AR1_selection = Arima(unrate,order=c(1,0,0),include.mean=FALSE)
(unrate_AR1 = forecast(unrate_AR1_selection, h = 4))
##          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## Apr 2019       3.796386 3.528925 4.063848 3.387339 4.205434
## May 2019       3.792776 3.414708 4.170844 3.214571 4.370981
## Jun 2019       3.789170 3.326353 4.251986 3.081353 4.496987
## Jul 2019       3.785566 3.251406 4.319727 2.968638 4.602495
unrate_AR2_selection = Arima(unrate,order=c(2,0,0),include.mean=FALSE)
(unrate_AR2 = forecast(unrate_AR2_selection, h = 4))
##          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## Apr 2019       3.796318 3.530569 4.062067 3.389890 4.202746
## May 2019       3.792207 3.393852 4.190561 3.182976 4.401438
## Jun 2019       3.788049 3.289295 4.286803 3.025270 4.550828
## Jul 2019       3.783890 3.201749 4.366031 2.893582 4.674197
unrate_AR3_selection = Arima(unrate,order=c(3,0,0),include.mean=FALSE)
(unrate_AR3 = forecast(unrate_AR3_selection, h = 4))
##          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## Apr 2019       3.740787 3.485067 3.996506 3.349697 4.131876
## May 2019       3.731364 3.354080 4.108648 3.154358 4.308370
## Jun 2019       3.710003 3.196038 4.223968 2.923962 4.496044
## Jul 2019       3.701287 3.073373 4.329200 2.740976 4.661597
unrate_AR4_selection = Arima(unrate,order=c(4,0,0),include.mean=FALSE)
(unrate_AR4 = forecast(unrate_AR4_selection, h = 4))
##          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## Apr 2019       3.760875 3.509035 4.012714 3.375720 4.146030
## May 2019       3.719155 3.356548 4.081762 3.164595 4.273715
## Jun 2019       3.702590 3.214905 4.190276 2.956740 4.448440
## Jul 2019       3.679400 3.064237 4.294563 2.738589 4.620211
unrate_AR5_selection = Arima(unrate,order=c(5,0,0),include.mean=FALSE)
(unrate_AR5 = forecast(unrate_AR5_selection, h = 4))
##          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## Apr 2019       3.782527 3.531511 4.033544 3.398631 4.166424
## May 2019       3.750936 3.392410 4.109461 3.202618 4.299253
## Jun 2019       3.723374 3.246172 4.200576 2.993557 4.453191
## Jul 2019       3.707142 3.109048 4.305235 2.792436 4.621847
# Evaluation for Square Loss Methods for Unrate
# In-sample fit: training RMSE declines slightly with order
# (0.2086 for AR(1) down to 0.1953 for AR(5)); AIC also favours higher orders.
summary(unrate_AR1_selection)
## Series: unrate 
## ARIMA(1,0,0) with zero mean 
## 
## Coefficients:
##          ar1
##       0.9990
## s.e.  0.0009
## 
## sigma^2 estimated as 0.04356:  log likelihood=123.83
## AIC=-243.67   AICc=-243.65   BIC=-234.16
## 
## Training set error measures:
##                       ME      RMSE      MAE        MPE     MAPE      MASE
## Training set 0.006110933 0.2085793 0.146867 0.04154946 2.671476 0.1693924
##                   ACF1
## Training set 0.1184901
summary(unrate_AR2_selection)
## Series: unrate 
## ARIMA(2,0,0) with zero mean 
## 
## Coefficients:
##          ar1      ar2
##       1.1167  -0.1176
## s.e.  0.0340   0.0340
## 
## sigma^2 estimated as 0.043:  log likelihood=129.77
## AIC=-253.53   AICc=-253.5   BIC=-239.28
## 
## Training set error measures:
##                       ME      RMSE       MAE        MPE     MAPE      MASE
## Training set 0.006146492 0.2071224 0.1477288 0.05177387 2.695145 0.1703863
##                     ACF1
## Training set -0.03136932
summary(unrate_AR3_selection)
## Series: unrate 
## ARIMA(3,0,0) with zero mean 
## 
## Coefficients:
##          ar1     ar2      ar3
##       1.0848  0.1881  -0.2741
## s.e.  0.0329  0.0492   0.0330
## 
## sigma^2 estimated as 0.03982:  log likelihood=163.01
## AIC=-318.02   AICc=-317.97   BIC=-299.02
## 
## Training set error measures:
##                       ME      RMSE       MAE        MPE     MAPE     MASE
## Training set 0.007006931 0.1991887 0.1446792 0.09418484 2.662406 0.166869
##                    ACF1
## Training set -0.0482898
summary(unrate_AR4_selection)
## Series: unrate 
## ARIMA(4,0,0) with zero mean 
## 
## Coefficients:
##          ar1     ar2      ar3      ar4
##       1.0359  0.2218  -0.0824  -0.1767
## s.e.  0.0337  0.0489   0.0489   0.0337
## 
## sigma^2 estimated as 0.03862:  log likelihood=176.5
## AIC=-343.01   AICc=-342.94   BIC=-319.25
## 
## Training set error measures:
##                       ME     RMSE       MAE       MPE     MAPE      MASE
## Training set 0.007759897 0.196051 0.1424413 0.1254265 2.620506 0.1642879
##                     ACF1
## Training set -0.01521411
summary(unrate_AR5_selection)
## Series: unrate 
## ARIMA(5,0,0) with zero mean 
## 
## Coefficients:
##          ar1     ar2      ar3      ar4      ar5
##       1.0198  0.2146  -0.0620  -0.0860  -0.0879
## s.e.  0.0342  0.0488   0.0494   0.0488   0.0343
## 
## sigma^2 estimated as 0.03836:  log likelihood=179.77
## AIC=-347.54   AICc=-347.45   BIC=-319.04
## 
## Training set error measures:
##                       ME      RMSE       MAE       MPE     MAPE      MASE
## Training set 0.008157459 0.1952956 0.1410063 0.1416879 2.597137 0.1626329
# Residual diagnostics via Ljung-Box: every order rejects white-noise
# residuals at conventional levels (Q* falls steadily as p rises), i.e. some
# serial correlation remains even at AR(5).
checkresiduals(unrate_AR1)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,0) with zero mean
## Q* = 252.7, df = 23, p-value < 2.2e-16
## 
## Model df: 1.   Total lags used: 24
checkresiduals(unrate_AR2)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,0,0) with zero mean
## Q* = 181.29, df = 22, p-value < 2.2e-16
## 
## Model df: 2.   Total lags used: 24
checkresiduals(unrate_AR3)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(3,0,0) with zero mean
## Q* = 85.417, df = 21, p-value = 9.847e-10
## 
## Model df: 3.   Total lags used: 24
checkresiduals(unrate_AR4)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(4,0,0) with zero mean
## Q* = 66.018, df = 20, p-value = 8.027e-07
## 
## Model df: 4.   Total lags used: 24
checkresiduals(unrate_AR5)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(5,0,0) with zero mean
## Q* = 64.413, df = 19, p-value = 7.604e-07
## 
## Model df: 5.   Total lags used: 24
#Plotting Forecasts
# subset(unrate, start = 800) zooms in on recent observations so the 4-step
# forecast fans are visible.
autoplot(subset(unrate, start = 800), series = "Observed") + autolayer(unrate_AR1, series = "AR(1)", PI = FALSE) + autolayer(unrate_AR2, series = "AR(2)", PI = FALSE) + autolayer(unrate_AR3, series = "AR(3)", PI = FALSE) + autolayer(unrate_AR4, series = "AR(4)", PI = FALSE) + autolayer(unrate_AR5, series = "AR(5)", PI = FALSE) + ggtitle("Forecasts for Unemployment Rate") + guides(colour=guide_legend(title="Series and Forecasts")) + labs(x = "Date", y = "Unemployment Rate (%)")

# Payems 
# Forecasting using AR(1-5) (Square Loss)
# Same procedure as for unrate: zero-mean AR(1)-AR(5) with 4-step forecasts.
# NOTE(review): include.mean = FALSE is more defensible here since payems is
# already a differenced (change) series, though its sample mean is positive -
# confirm the zero-mean restriction is intended.
payems_AR1_selection = Arima(payems,order=c(1,0,0),include.mean=FALSE)
(payems_AR1 = forecast(payems_AR1_selection, h = 4))
##          Point Forecast     Lo 80    Hi 80     Lo 95    Hi 95
## Apr 2019      124.85988 -129.2481 378.9678 -263.7647 513.4844
## May 2019       79.54076 -221.7481 380.8297 -381.2408 540.3224
## Jun 2019       50.67066 -267.7774 369.1187 -436.3536 537.6950
## Jul 2019       32.27925 -292.8742 357.4326 -465.0000 529.5585
payems_AR2_selection = Arima(payems,order=c(2,0,0),include.mean=FALSE)
(payems_AR2 = forecast(payems_AR2_selection, h = 4))
##          Point Forecast     Lo 80    Hi 80     Lo 95    Hi 95
## Apr 2019       88.02816 -144.8883 320.9447 -268.1869 444.2432
## May 2019      112.06959 -137.2391 361.3783 -269.2151 493.3543
## Jun 2019       78.02065 -201.8558 357.8971 -350.0134 506.0547
## Jul 2019       74.64842 -217.6029 366.8998 -372.3114 521.6083
payems_AR3_selection = Arima(payems,order=c(3,0,0),include.mean=FALSE)
(payems_AR3 = forecast(payems_AR3_selection, h = 4))
##          Point Forecast      Lo 80    Hi 80     Lo 95    Hi 95
## Apr 2019      133.81502  -94.15162 361.7817 -214.8299 482.4599
## May 2019      109.66524 -128.23537 347.5658 -254.1723 473.5028
## Jun 2019      116.32820 -139.26567 371.9221 -274.5689 507.2253
## Jul 2019       97.66526 -175.72039 371.0509 -320.4420 515.7725
payems_AR4_selection = Arima(payems,order=c(4,0,0),include.mean=FALSE)
(payems_AR4 = forecast(payems_AR4_selection, h = 4))
##          Point Forecast      Lo 80    Hi 80     Lo 95    Hi 95
## Apr 2019       142.4935  -84.31657 369.3035 -204.3826 489.3695
## May 2019       134.4093 -100.92589 369.7445 -225.5048 494.3235
## Jun 2019       116.0046 -133.35616 365.3654 -265.3598 497.3690
## Jul 2019       116.3982 -145.68936 378.4857 -284.4301 517.2265
payems_AR5_selection = Arima(payems,order=c(5,0,0),include.mean=FALSE)
(payems_AR5 = forecast(payems_AR5_selection, h = 4))
##          Point Forecast      Lo 80    Hi 80     Lo 95    Hi 95
## Apr 2019       144.1258  -81.67221 369.9239 -201.2025 489.4541
## May 2019       142.9635  -90.70492 376.6320 -214.4015 500.3286
## Jun 2019       139.3661 -106.62747 385.3596 -236.8486 515.5807
## Jul 2019       115.3776 -140.34509 371.1002 -275.7165 506.4716
# Evaluation for Square Loss Methods for Payems
# MPE/MAPE are Inf because the series contains zero (and near-zero) monthly
# changes, so percentage errors divide by zero - rely on RMSE/MAE/MASE here.
summary(payems_AR1_selection)
## Series: payems 
## ARIMA(1,0,0) with zero mean 
## 
## Coefficients:
##          ar1
##       0.6370
## s.e.  0.0248
## 
## sigma^2 estimated as 39316:  log likelihood=-6453.46
## AIC=12910.92   AICc=12910.93   BIC=12920.65
## 
## Training set error measures:
##                   ME     RMSE     MAE MPE MAPE      MASE      ACF1
## Training set 45.7002 198.1784 132.692 Inf  Inf 0.6262197 -0.324929
summary(payems_AR2_selection)
## Series: payems 
## ARIMA(2,0,0) with zero mean 
## 
## Coefficients:
##          ar1     ar2
##       0.3817  0.4003
## s.e.  0.0295  0.0295
## 
## sigma^2 estimated as 33032:  log likelihood=-6369.36
## AIC=12744.72   AICc=12744.75   BIC=12759.33
## 
## Training set error measures:
##                    ME     RMSE      MAE MPE MAPE      MASE       ACF1
## Training set 27.45412 181.5567 118.8735 Inf  Inf 0.5610055 -0.1079646
summary(payems_AR3_selection)
## Series: payems 
## ARIMA(3,0,0) with zero mean 
## 
## Coefficients:
##          ar1     ar2     ar3
##       0.2984  0.3208  0.2075
## s.e.  0.0315  0.0313  0.0316
## 
## sigma^2 estimated as 31642:  log likelihood=-6348.26
## AIC=12704.52   AICc=12704.56   BIC=12724
## 
## Training set error measures:
##                   ME     RMSE      MAE MPE MAPE      MASE        ACF1
## Training set 21.8769 177.6057 117.1603 Inf  Inf 0.5529201 -0.03705291
summary(payems_AR4_selection)
## Series: payems 
## ARIMA(4,0,0) with zero mean 
## 
## Coefficients:
##          ar1     ar2     ar3     ar4
##       0.2767  0.2869  0.1757  0.1056
## s.e.  0.0320  0.0328  0.0329  0.0321
## 
## sigma^2 estimated as 31322:  log likelihood=-6342.89
## AIC=12695.78   AICc=12695.84   BIC=12720.12
## 
## Training set error measures:
##                    ME     RMSE      MAE MPE MAPE      MASE        ACF1
## Training set 19.62167 176.6125 116.3833 Inf  Inf 0.5492534 -0.02281626
summary(payems_AR5_selection)
## Series: payems 
## ARIMA(5,0,0) with zero mean 
## 
## Coefficients:
##          ar1     ar2     ar3     ar4     ar5
##       0.2663  0.2696  0.1469  0.0778  0.0996
## s.e.  0.0320  0.0331  0.0340  0.0332  0.0321
## 
## sigma^2 estimated as 31043:  log likelihood=-6338.11
## AIC=12688.22   AICc=12688.3   BIC=12717.43
## 
## Training set error measures:
##                    ME     RMSE      MAE MPE MAPE      MASE         ACF1
## Training set 17.72833 175.7327 116.5471 Inf  Inf 0.5500264 -0.003355769
# Ljung-Box diagnostics: orders 1-4 reject white-noise residuals; only AR(5)
# fails to reject (p = 0.1071), making it the cleanest model in this family.
checkresiduals(payems_AR1)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,0) with zero mean
## Q* = 168.52, df = 23, p-value < 2.2e-16
## 
## Model df: 1.   Total lags used: 24
checkresiduals(payems_AR2)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,0,0) with zero mean
## Q* = 69.617, df = 22, p-value = 7.599e-07
## 
## Model df: 2.   Total lags used: 24
checkresiduals(payems_AR3)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(3,0,0) with zero mean
## Q* = 44.922, df = 21, p-value = 0.001773
## 
## Model df: 3.   Total lags used: 24
checkresiduals(payems_AR4)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(4,0,0) with zero mean
## Q* = 34.234, df = 20, p-value = 0.02459
## 
## Model df: 4.   Total lags used: 24
checkresiduals(payems_AR5)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(5,0,0) with zero mean
## Q* = 26.895, df = 19, p-value = 0.1071
## 
## Model df: 5.   Total lags used: 24
#Plotting Forecasts
# Overlay all five forecast paths on the recent portion of the series.
autoplot(subset(payems, start = 800), series = "Observed") + autolayer(payems_AR1, series = "AR(1)", PI = FALSE) + autolayer(payems_AR2, series = "AR(2)", PI = FALSE) + autolayer(payems_AR3, series = "AR(3)", PI = FALSE) + autolayer(payems_AR4, series = "AR(4)", PI = FALSE) + autolayer(payems_AR5, series = "AR(5)", PI = FALSE) + ggtitle("Forecasts for Changes in Nonfarm Payroll Employment") + guides(colour=guide_legend(title="Series and Forecasts")) + labs(x = "Date", y = "Change in Employment (100's of thousands of persons)")

Residual plots look similar for all methods; however, in an effort to be more selective, I will remove AR(1) and AR(2) forecasts from my final average, as their RMSE values are slightly worse than the other methods.

Vector Autoregressions Forecasts Incorporating Additional Variables

Below I created vector autoregressions jointly in unrate, payems, prices of the NASDAQ (Fred Series: NASDAQCOM), US Retail Gas Prices (Fred Series: GASREGW), Consumer Price Index (Fred Series: CPIAUCSL), and the effective Federal Funds Rate (Fred Series: EFFR). I decided to choose these variables after conducting some independent research online on economic indicators that have been found to be linked to employment figures. The following research study was used in my decision-making process: https://pdfs.semanticscholar.org/0ab6/643904b461588428735b0b8c4b8e686bab35.pdf.

I found that all of the series I added had issues with stationarity and dependence. I discovered the stationarity issue by using the Augmented Dickey-Fuller Test, which tests the null hypothesis of non-stationarity (the presence of a unit root) against the alternative of stationarity. I adjusted the time series by using the diff function as shown below - this improved the stationarity and dependence issues.

#Loading NASDAQ, GAS prices, CPI, and FFR data
# For each exogenous series: load the local CSV export (value assumed in
# column 2), inspect seasonality, run an ADF test (null: unit root /
# non-stationarity), difference by the order ndiffs() suggests, and re-test.
NASDAQ = read.csv("NASDAQCOM.csv")
nasdaq =  ts(NASDAQ[,2], start = c(1971,3), frequency = 12)
ggseasonplot(nasdaq)

# Levels: ADF fails to reject the unit-root null (p = 0.97).
adf.test(nasdaq)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  nasdaq
## Dickey-Fuller = -0.69033, Lag order = 8, p-value = 0.9714
## alternative hypothesis: stationary
nasdaq = diff(nasdaq, differences = ndiffs(nasdaq))
# After differencing: unit root strongly rejected.
adf.test(nasdaq)
## Warning in adf.test(nasdaq): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  nasdaq
## Dickey-Fuller = -13.733, Lag order = 8, p-value = 0.01
## alternative hypothesis: stationary
acf(nasdaq)

GAS = read.csv("GASREGW.csv")
gas =  ts(GAS[,2], start = c(1990,9), frequency = 12)
ggseasonplot(gas)

adf.test(gas)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  gas
## Dickey-Fuller = -2.2635, Lag order = 6, p-value = 0.4656
## alternative hypothesis: stationary
gas = diff(gas, differences = ndiffs(gas))
adf.test(gas)
## Warning in adf.test(gas): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  gas
## Dickey-Fuller = -9.5982, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
acf(gas)

CPI = read.csv("CPIAUCSL.csv")
cpi =  ts(CPI[,2], start = c(1947,1), frequency = 12)
ggseasonplot(cpi)

adf.test(cpi)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  cpi
## Dickey-Fuller = -2.8571, Lag order = 9, p-value = 0.2155
## alternative hypothesis: stationary
cpi = diff(cpi, differences = ndiffs(cpi))
adf.test(cpi)
## Warning in adf.test(cpi): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  cpi
## Dickey-Fuller = -16.481, Lag order = 9, p-value = 0.01
## alternative hypothesis: stationary
acf(cpi)

# Load the effective Federal Funds Rate and apply the same stationarity
# procedure (value assumed in column 2 of the CSV export).
FFR = read.csv("EFFR.csv")
ffr = ts(FFR[,2], start = c(2000,7), frequency = 12)
ggseasonplot(ffr)

# Levels: ADF fails to reject the unit-root null (p = 0.17).
adf.test(ffr)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ffr
## Dickey-Fuller = -2.9627, Lag order = 6, p-value = 0.1716
## alternative hypothesis: stationary
ffr = diff(ffr, differences = ndiffs(ffr))
# BUG FIX: the original code re-ran adf.test(nasdaq) and acf(nasdaq) here
# (copy-paste error), so the differenced ffr series was never actually
# verified. Corrected to test/plot ffr. The captured output below is stale -
# it shows the nasdaq results from the buggy run - and should be regenerated
# by re-knitting the report.
adf.test(ffr)
## Warning in adf.test(nasdaq): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  nasdaq
## Dickey-Fuller = -13.733, Lag order = 8, p-value = 0.01
## alternative hypothesis: stationary
acf(ffr)

# Combine the (differenced) series into one multivariate ts and drop rows
# with NAs. NOTE(review): the components start at very different dates (gas
# 1990, ffr 2000), so na.omit() restricts the estimation sample to the period
# where all six overlap - confirm the resulting sample is what is intended.
df_2 = na.omit(cbind(unrate, payems, nasdaq, gas, cpi, ffr))
colnames(df_2) = c("unrate", "payems", "nasdaq", "gas", "cpi", "ffr")

# Fit VAR(p) for p = 1..5 and print 4-step-ahead forecasts for all six
# variables (only unrate and payems are of direct interest).
var.1 = VAR(df_2, p = 1)
(var.1_forecast = forecast(var.1,4))
## unrate
##          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## Apr 2019       3.784548 3.612656 3.956440 3.521661 4.047435
## May 2019       3.776178 3.516622 4.035735 3.379221 4.173136
## Jun 2019       3.773278 3.428418 4.118138 3.245860 4.300696
## Jul 2019       3.779829 3.348421 4.211237 3.120047 4.439610
## 
## payems
##          Point Forecast       Lo 80    Hi 80      Lo 95    Hi 95
## Apr 2019       170.5150    3.829352 337.2007  -84.40871 425.4388
## May 2019       166.8042  -47.666709 381.2750 -161.20073 494.8091
## Jun 2019       152.3719  -88.867382 393.6113 -216.57177 521.3157
## Jul 2019       140.0371 -117.730368 397.8045 -254.18422 534.2584
## 
## nasdaq
##          Point Forecast     Lo 80    Hi 80     Lo 95    Hi 95
## Apr 2019     48.7074963 -158.7602 256.1752 -268.5870 366.0020
## May 2019    -20.0349777 -243.2204 203.1504 -361.3676 321.2976
## Jun 2019      5.1537569 -218.5821 228.8896 -337.0207 347.3282
## Jul 2019      0.5525415 -223.3046 224.4097 -341.8074 342.9125
## 
## gas
##          Point Forecast      Lo 80     Hi 80      Lo 95     Hi 95
## Apr 2019   0.0581049634 -0.1262646 0.2424745 -0.2238639 0.3400739
## May 2019   0.0123192813 -0.2022523 0.2268908 -0.3158396 0.3404781
## Jun 2019  -0.0007349428 -0.2177075 0.2162376 -0.3325659 0.3310960
## Jul 2019  -0.0050158078 -0.2222120 0.2121803 -0.3371886 0.3271570
## 
## cpi
##          Point Forecast      Lo 80     Hi 80     Lo 95     Hi 95
## Apr 2019    -0.47545204 -1.2466688 0.2957647 -1.654926 0.7040222
## May 2019    -0.12631997 -0.9520481 0.6994082 -1.389162 1.1365222
## Jun 2019    -0.03702156 -0.8948902 0.8208471 -1.349018 1.2749753
## Jul 2019    -0.01691086 -0.8768941 0.8430724 -1.332142 1.2983200
## 
## ffr
##          Point Forecast      Lo 80     Hi 80      Lo 95     Hi 95
## Apr 2019   -0.052188405 -0.2149164 0.1105395 -0.3010593 0.1966825
## May 2019    0.010611978 -0.1592988 0.1805228 -0.2492442 0.2704681
## Jun 2019   -0.012254638 -0.1831584 0.1586491 -0.2736294 0.2491201
## Jul 2019   -0.003828483 -0.1748858 0.1672288 -0.2654380 0.2577811
var.2 = VAR(df_2, p = 2)
(var.2_forecast = forecast(var.2,4))
## unrate
##          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## Apr 2019       3.819508 3.656120 3.982896 3.569627 4.069389
## May 2019       3.817063 3.600182 4.033943 3.485373 4.148753
## Jun 2019       3.855172 3.573062 4.137282 3.423723 4.286622
## Jul 2019       3.880000 3.529658 4.230342 3.344198 4.415801
## 
## payems
##          Point Forecast      Lo 80    Hi 80     Lo 95    Hi 95
## Apr 2019       101.0290  -53.62355 255.6815 -135.4917 337.5496
## May 2019       126.7408  -48.60800 302.0897 -141.4321 394.9137
## Jun 2019       113.3847  -87.37936 314.1488 -193.6574 420.4269
## Jul 2019       100.0482 -117.51908 317.6154 -232.6922 432.7886
## 
## nasdaq
##          Point Forecast     Lo 80    Hi 80     Lo 95    Hi 95
## Apr 2019       45.44990 -152.5305 243.4303 -257.3350 348.2348
## May 2019       36.73652 -180.7761 254.2491 -295.9203 369.3933
## Jun 2019      -24.07420 -243.9751 195.8267 -360.3836 312.2352
## Jul 2019       12.94950 -208.8434 234.7424 -326.2534 352.1524
## 
## gas
##          Point Forecast      Lo 80     Hi 80      Lo 95     Hi 95
## Apr 2019    0.044054657 -0.1355045 0.2236138 -0.2305573 0.3186667
## May 2019   -0.012304366 -0.2279236 0.2033149 -0.3420655 0.3174568
## Jun 2019   -0.004416892 -0.2241993 0.2153655 -0.3405450 0.3317113
## Jul 2019   -0.014093785 -0.2343284 0.2061408 -0.3509135 0.3227259
## 
## cpi
##          Point Forecast      Lo 80     Hi 80     Lo 95     Hi 95
## Apr 2019    -0.53474505 -1.2678239 0.1983338 -1.655892 0.5864024
## May 2019    -0.24408589 -1.0313928 0.5432210 -1.448168 0.9599962
## Jun 2019     0.11397957 -0.7356153 0.9635744 -1.185364 1.4133228
## Jul 2019    -0.02829024 -0.8981828 0.8416024 -1.358676 1.3020957
## 
## ffr
##          Point Forecast      Lo 80     Hi 80      Lo 95     Hi 95
## Apr 2019    0.011984888 -0.1452179 0.1691876 -0.2284360 0.2524058
## May 2019   -0.003680842 -0.1711289 0.1637672 -0.2597706 0.2524089
## Jun 2019   -0.021510859 -0.1924420 0.1494203 -0.2829275 0.2399058
## Jul 2019   -0.010154719 -0.1828404 0.1625309 -0.2742546 0.2539452
var.3 = VAR(df_2, p = 3)
(var.3_forecast = forecast(var.3,4))
## unrate
##          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## Apr 2019       3.776989 3.612503 3.941474 3.525430 4.028548
## May 2019       3.770655 3.555019 3.986292 3.440867 4.100443
## Jun 2019       3.825203 3.555116 4.095290 3.412141 4.238265
## Jul 2019       3.853711 3.517103 4.190319 3.338913 4.368509
## 
## payems
##          Point Forecast     Lo 80    Hi 80      Lo 95    Hi 95
## Apr 2019       133.2410 -18.82980 285.3119  -99.33123 365.8133
## May 2019       102.3208 -66.29431 270.9359 -155.55376 360.1954
## Jun 2019       133.3276 -55.00041 321.6556 -154.69523 421.3504
## Jul 2019       109.2348 -94.62420 313.0938 -202.54064 421.0102
## 
## nasdaq
##          Point Forecast     Lo 80     Hi 80     Lo 95    Hi 95
## Apr 2019     -107.54269 -297.7351  82.64969 -398.4168 183.3315
## May 2019       72.33887 -141.0458 285.72354 -254.0048 398.6826
## Jun 2019       53.37743 -166.3732 273.12809 -282.7022 389.4571
## Jul 2019      -20.82912 -242.8418 201.18351 -360.3681 318.7099
## 
## gas
##          Point Forecast      Lo 80     Hi 80      Lo 95     Hi 95
## Apr 2019    0.027394896 -0.1528655 0.2076553 -0.2482896 0.3030794
## May 2019   -0.003603685 -0.2185786 0.2113712 -0.3323794 0.3251721
## Jun 2019    0.013418809 -0.2063837 0.2332213 -0.3227400 0.3495777
## Jul 2019   -0.014556712 -0.2359026 0.2067891 -0.3530760 0.3239626
## 
## cpi
##          Point Forecast      Lo 80       Hi 80     Lo 95     Hi 95
## Apr 2019    -0.72422512 -1.4500014 0.001551141 -1.834204 0.3857539
## May 2019    -0.16313429 -0.9559253 0.629656697 -1.375603 1.0493349
## Jun 2019     0.16527297 -0.6753332 1.005879105 -1.120323 1.4508691
## Jul 2019     0.03227664 -0.8416474 0.906200655 -1.304275 1.3688281
## 
## ffr
##          Point Forecast       Lo 80      Hi 80      Lo 95     Hi 95
## Apr 2019     0.12956964 -0.02647686 0.28561614 -0.1090829 0.3682222
## May 2019    -0.03471638 -0.20339601 0.13396326 -0.2926896 0.2232569
## Jun 2019    -0.09469548 -0.26729609 0.07790513 -0.3586653 0.1692744
## Jul 2019     0.01826820 -0.15579024 0.19232664 -0.2479312 0.2844676
var.4 = VAR(df_2, p = 4)
(var.4_forecast = forecast(var.4,4))
## unrate
##          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## Apr 2019       3.763723 3.600559 3.926887 3.514185 4.013261
## May 2019       3.740531 3.526110 3.954953 3.412602 4.068460
## Jun 2019       3.761494 3.485065 4.037924 3.338732 4.184256
## Jul 2019       3.821379 3.473623 4.169134 3.289532 4.353225
## 
## payems
##          Point Forecast      Lo 80    Hi 80      Lo 95    Hi 95
## Apr 2019       152.9755   1.722164 304.2288  -78.34650 384.2974
## May 2019       154.8804 -10.930315 320.6912  -98.70522 408.4661
## Jun 2019       136.9906 -47.726625 321.7079 -145.51002 419.4913
## Jul 2019       141.4516 -62.204926 345.1081 -170.01416 452.9173
## 
## nasdaq
##          Point Forecast      Lo 80      Hi 80     Lo 95    Hi 95
## Apr 2019     -182.33785 -367.18609   2.510388 -465.0388 100.3631
## May 2019      -55.08436 -267.86856 157.699851 -380.5097 270.3410
## Jun 2019      173.69372  -46.80719 394.194629 -163.5333 510.9208
## Jul 2019       63.07739 -159.49972 285.654504 -277.3249 403.4797
## 
## gas
##          Point Forecast      Lo 80     Hi 80      Lo 95     Hi 95
## Apr 2019    0.046778737 -0.1323110 0.2258685 -0.2271154 0.3206729
## May 2019    0.021559216 -0.1917252 0.2348436 -0.3046311 0.3477495
## Jun 2019    0.059523504 -0.1583505 0.2773975 -0.2736860 0.3927330
## Jul 2019   -0.005752113 -0.2266422 0.2151379 -0.3435743 0.3320701
## 
## cpi
##          Point Forecast      Lo 80     Hi 80     Lo 95     Hi 95
## Apr 2019     -0.5910216 -1.3112959 0.1292527 -1.692586 0.5105428
## May 2019     -0.1032851 -0.8862663 0.6796960 -1.300751 1.0941812
## Jun 2019      0.2194156 -0.6141956 1.0530268 -1.055483 1.4943140
## Jul 2019     -0.1192999 -0.9901350 0.7515353 -1.451127 1.2125276
## 
## ffr
##          Point Forecast       Lo 80      Hi 80      Lo 95     Hi 95
## Apr 2019    0.117871724 -0.03739561 0.27313906 -0.1195892 0.3553326
## May 2019    0.009755814 -0.15929074 0.17880237 -0.2487786 0.2682902
## Jun 2019   -0.115710495 -0.28983557 0.05841458 -0.3820118 0.1505908
## Jul 2019   -0.015134778 -0.19016232 0.15989276 -0.2828163 0.2525467
var.5 = VAR(df_2, p = 5)
(var.5_forecast = forecast(var.5,4))
## unrate
##          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## Apr 2019       3.732872 3.571814 3.893930 3.486555 3.979189
## May 2019       3.739122 3.529462 3.948781 3.418475 4.059769
## Jun 2019       3.775919 3.501242 4.050597 3.355837 4.196002
## Jul 2019       3.775203 3.426469 4.123937 3.241861 4.308546
## 
## payems
##          Point Forecast     Lo 80    Hi 80      Lo 95    Hi 95
## Apr 2019       114.5014 -35.35119 264.3540 -114.67837 343.6812
## May 2019       194.5945  31.12072 358.0682  -55.41706 444.6060
## Jun 2019       231.0063  49.45302 412.5596  -46.65549 508.6681
## Jul 2019       174.7241 -23.48688 372.9350 -128.41342 477.8616
## 
## nasdaq
##          Point Forecast      Lo 80     Hi 80     Lo 95     Hi 95
## Apr 2019     -202.54004 -382.11671 -22.96338 -477.1789  72.09877
## May 2019      -12.62012 -217.94721 192.70696 -326.6408 301.40055
## Jun 2019      164.87106  -48.80671 378.54882 -161.9209 491.66300
## Jul 2019      -29.59249 -247.37120 188.18621 -362.6563 303.47130
## 
## gas
##          Point Forecast       Lo 80     Hi 80      Lo 95     Hi 95
## Apr 2019     0.07616408 -0.10397631 0.2563045 -0.1993369 0.3516650
## May 2019     0.06727989 -0.14665707 0.2812169 -0.2599085 0.3944682
## Jun 2019     0.12513951 -0.09306441 0.3433434 -0.2085746 0.4588536
## Jul 2019     0.02029685 -0.20059954 0.2411933 -0.3175350 0.3581287
## 
## cpi
##          Point Forecast      Lo 80     Hi 80      Lo 95     Hi 95
## Apr 2019    -0.30209427 -1.0181391 0.4139506 -1.3971904 0.7930019
## May 2019    -0.03129378 -0.8217219 0.7591344 -1.2401493 1.1775617
## Jun 2019     0.36568376 -0.4781720 1.2095395 -0.9248823 1.6562498
## Jul 2019    -0.27216249 -1.1469277 0.6026027 -1.6100004 1.0656754
## 
## ffr
##          Point Forecast         Lo 80      Hi 80       Lo 95     Hi 95
## Apr 2019     0.14598181  0.0001736102 0.29179001 -0.07701259 0.3689762
## May 2019     0.02253829 -0.1384181533 0.18349473 -0.22362335 0.2686999
## Jun 2019    -0.08585673 -0.2516085930 0.07989514 -0.33935233 0.1676389
## Jul 2019    -0.02177301 -0.1907322407 0.14718621 -0.28017385 0.2366278
# Evaluating VAR(1-5) (Unrate, Payems, Nasdaq, Gas, CPI, FFR) 
# AIC keeps improving with order (favours VAR(5)), while BIC, which penalizes
# the rapidly growing parameter count harder, favours VAR(1) - the usual
# AIC/BIC disagreement for richly parameterized VARs.
AIC(var.1)
## [1] 5062.369
AIC(var.2)
## [1] 4950.203
AIC(var.3)
## [1] 4925.694
AIC(var.4)
## [1] 4906.418
AIC(var.5)
## [1] 4863.024
BIC(var.1)
## [1] 5205.281
BIC(var.2)
## [1] 5215.259
BIC(var.3)
## [1] 5312.568
BIC(var.4)
## [1] 5414.778
BIC(var.5)
## [1] 5492.54
# Breusch-Godfrey tests for residual serial correlation, equation by
# equation: only the VAR(1) unrate equation and the VAR(1)/VAR(3)/VAR(4)
# payems equations reject at the 5% level.
checkresiduals(var.1$varresult$unrate)

## 
##  Breusch-Godfrey test for serial correlation of order up to 10
## 
## data:  Residuals
## LM test = 23.276, df = 10, p-value = 0.009773
checkresiduals(var.2$varresult$unrate)

## 
##  Breusch-Godfrey test for serial correlation of order up to 16
## 
## data:  Residuals
## LM test = 18.854, df = 16, p-value = 0.2763
checkresiduals(var.3$varresult$unrate)

## 
##  Breusch-Godfrey test for serial correlation of order up to 22
## 
## data:  Residuals
## LM test = 25.883, df = 22, p-value = 0.2567
checkresiduals(var.4$varresult$unrate)

## 
##  Breusch-Godfrey test for serial correlation of order up to 28
## 
## data:  Residuals
## LM test = 39.768, df = 28, p-value = 0.06934
checkresiduals(var.5$varresult$unrate)

## 
##  Breusch-Godfrey test for serial correlation of order up to 34
## 
## data:  Residuals
## LM test = 46.444, df = 34, p-value = 0.07562
checkresiduals(var.1$varresult$payems)

## 
##  Breusch-Godfrey test for serial correlation of order up to 10
## 
## data:  Residuals
## LM test = 44.164, df = 10, p-value = 3.076e-06
checkresiduals(var.2$varresult$payems)

## 
##  Breusch-Godfrey test for serial correlation of order up to 16
## 
## data:  Residuals
## LM test = 24.803, df = 16, p-value = 0.07337
checkresiduals(var.3$varresult$payems)

## 
##  Breusch-Godfrey test for serial correlation of order up to 22
## 
## data:  Residuals
## LM test = 35.064, df = 22, p-value = 0.03815
checkresiduals(var.4$varresult$payems)

## 
##  Breusch-Godfrey test for serial correlation of order up to 28
## 
## data:  Residuals
## LM test = 50.27, df = 28, p-value = 0.006034
checkresiduals(var.5$varresult$payems)

## 
##  Breusch-Godfrey test for serial correlation of order up to 34
## 
## data:  Residuals
## LM test = 44.009, df = 34, p-value = 0.1169
# Plots 
# subset(..., start = 800) zooms in on the recent portion of each series so
# the short forecast paths are visible.

autoplot(subset(unrate, start = 800), series = "Observed") + autolayer(var.1_forecast$forecast$unrate, series = "VAR(1)", PI = FALSE) + autolayer(var.2_forecast$forecast$unrate, series = "VAR(2)", PI = FALSE) + autolayer(var.3_forecast$forecast$unrate, series = "VAR(3)", PI = FALSE) + autolayer(var.4_forecast$forecast$unrate, series = "VAR(4)", PI = FALSE) + autolayer(var.5_forecast$forecast$unrate, series = "VAR(5)", PI = FALSE) + ggtitle("Forecasts for Unemployment Rate") + guides(colour=guide_legend(title="Series and Forecasts")) + labs(x = "Date", y = "Unemployment Rate (%)")

autoplot(subset(payems, start = 800), series = "Observed") + autolayer(var.1_forecast$forecast$payems, series = "VAR(1)", PI = FALSE) + autolayer(var.2_forecast$forecast$payems, series = "VAR(2)", PI = FALSE) + autolayer(var.3_forecast$forecast$payems, series = "VAR(3)", PI = FALSE) + autolayer(var.4_forecast$forecast$payems, series = "VAR(4)", PI = FALSE) + autolayer(var.5_forecast$forecast$payems, series = "VAR(5)", PI = FALSE) + ggtitle("Forecasts for Changes in Nonfarm Payroll Employment") + guides(colour=guide_legend(title="Series and Forecasts")) + labs(x = "Date", y = "Change in Employment (100's of thousands of persons)")

Residual plots and AIC/BIC values look similar across the board - so I will include all forecasts in my final average.

Multivariate Forecasts - VAR(5) Using Penalized ERM

# unrate 

# Unrate
# NOTE(review): a bug makes this whole section a no-op. predict() on var.5
# dispatches to the vars package's method for "varest" objects, which has no
# `s` argument -- the glmnet lambda path passed via `s=` is silently ignored,
# so every call below returns the same *unpenalized* VAR(5) forecast. All
# four output tables in this subsection are byte-identical, confirming the
# lasso/ridge fits a, b, c, d never influence the forecasts. Penalized
# forecasts would have to be iterated from the glmnet fits themselves
# (predict(a, newx = ...) over the horizon).
# NOTE(review): this also assumes df_2 is still the VAR design frame built
# earlier; df_2 is overwritten in the ML section below -- confirm chunk
# execution order.
# Lasso (glmnet default alpha = 1), unrate as the response.
a = glmnet(df_2[,c(2:6)],df_2[,1],family="gaussian")
predict(var.5,s=a$lambda)$fcst$unrate
##           fcst    lower    upper        CI
##  [1,] 3.732872 3.486555 3.979189 0.2463172
##  [2,] 3.739122 3.418475 4.059769 0.3206470
##  [3,] 3.775919 3.355837 4.196002 0.4200826
##  [4,] 3.775203 3.241861 4.308546 0.5333426
##  [5,] 3.730270 3.086354 4.374186 0.6439159
##  [6,] 3.707339 2.942221 4.472457 0.7651178
##  [7,] 3.712405 2.825655 4.599155 0.8867498
##  [8,] 3.761736 2.748961 4.774510 1.0127743
##  [9,] 3.816069 2.676091 4.956048 1.1399782
## [10,] 3.851799 2.586725 5.116873 1.2650736
# Multi-response gaussian family, same predictors.
b = glmnet(df_2[,c(2:6)],df_2[,1],family="mgaussian")
predict(var.5,s=b$lambda)$fcst$unrate
##           fcst    lower    upper        CI
##  [1,] 3.732872 3.486555 3.979189 0.2463172
##  [2,] 3.739122 3.418475 4.059769 0.3206470
##  [3,] 3.775919 3.355837 4.196002 0.4200826
##  [4,] 3.775203 3.241861 4.308546 0.5333426
##  [5,] 3.730270 3.086354 4.374186 0.6439159
##  [6,] 3.707339 2.942221 4.472457 0.7651178
##  [7,] 3.712405 2.825655 4.599155 0.8867498
##  [8,] 3.761736 2.748961 4.774510 1.0127743
##  [9,] 3.816069 2.676091 4.956048 1.1399782
## [10,] 3.851799 2.586725 5.116873 1.2650736
# Ridge (alpha = 0). NOTE: `c` shadows base::c() from here on.
c = glmnet(df_2[,c(2:6)],df_2[,1],family="gaussian", alpha = 0)
predict(var.5,s=c$lambda)$fcst$unrate
##           fcst    lower    upper        CI
##  [1,] 3.732872 3.486555 3.979189 0.2463172
##  [2,] 3.739122 3.418475 4.059769 0.3206470
##  [3,] 3.775919 3.355837 4.196002 0.4200826
##  [4,] 3.775203 3.241861 4.308546 0.5333426
##  [5,] 3.730270 3.086354 4.374186 0.6439159
##  [6,] 3.707339 2.942221 4.472457 0.7651178
##  [7,] 3.712405 2.825655 4.599155 0.8867498
##  [8,] 3.761736 2.748961 4.774510 1.0127743
##  [9,] 3.816069 2.676091 4.956048 1.1399782
## [10,] 3.851799 2.586725 5.116873 1.2650736
# Ridge, multi-response gaussian.
d = glmnet(df_2[,c(2:6)],df_2[,1],family="mgaussian", alpha = 0)
predict(var.5,s=d$lambda)$fcst$unrate
##           fcst    lower    upper        CI
##  [1,] 3.732872 3.486555 3.979189 0.2463172
##  [2,] 3.739122 3.418475 4.059769 0.3206470
##  [3,] 3.775919 3.355837 4.196002 0.4200826
##  [4,] 3.775203 3.241861 4.308546 0.5333426
##  [5,] 3.730270 3.086354 4.374186 0.6439159
##  [6,] 3.707339 2.942221 4.472457 0.7651178
##  [7,] 3.712405 2.825655 4.599155 0.8867498
##  [8,] 3.761736 2.748961 4.774510 1.0127743
##  [9,] 3.816069 2.676091 4.956048 1.1399782
## [10,] 3.851799 2.586725 5.116873 1.2650736
# Payems
# NOTE(review): same `s=` issue as above -- all four payems tables below are
# identical unpenalized VAR(5) forecasts.

a = glmnet(df_2[,c(1,3,4,5,6)],df_2[,2],family="gaussian")
predict(var.5,s=a$lambda)$fcst$payems
##            fcst      lower    upper       CI
##  [1,] 114.50142 -114.67837 343.6812 229.1798
##  [2,] 194.59447  -55.41706 444.6060 250.0115
##  [3,] 231.00633  -46.65549 508.6681 277.6618
##  [4,] 174.72407 -128.41342 477.8616 303.1375
##  [5,] 146.18593 -178.22388 470.5957 324.4098
##  [6,] 100.65108 -244.37474 445.6769 345.0258
##  [7,] 102.56050 -256.12452 461.2455 358.6850
##  [8,] 104.99433 -263.46527 473.4539 368.4596
##  [9,]  83.20564 -292.53628 458.9476 375.7419
## [10,]  67.29079 -316.18083 450.7624 383.4716
b = glmnet(df_2[,c(1,3,4,5,6)],df_2[,2],family="mgaussian")
predict(var.5,s=b$lambda)$fcst$payems
##            fcst      lower    upper       CI
##  [1,] 114.50142 -114.67837 343.6812 229.1798
##  [2,] 194.59447  -55.41706 444.6060 250.0115
##  [3,] 231.00633  -46.65549 508.6681 277.6618
##  [4,] 174.72407 -128.41342 477.8616 303.1375
##  [5,] 146.18593 -178.22388 470.5957 324.4098
##  [6,] 100.65108 -244.37474 445.6769 345.0258
##  [7,] 102.56050 -256.12452 461.2455 358.6850
##  [8,] 104.99433 -263.46527 473.4539 368.4596
##  [9,]  83.20564 -292.53628 458.9476 375.7419
## [10,]  67.29079 -316.18083 450.7624 383.4716
c = glmnet(df_2[,c(1,3,4,5,6)],df_2[,2],family="gaussian", alpha = 0)
predict(var.5,s=c$lambda)$fcst$payems
##            fcst      lower    upper       CI
##  [1,] 114.50142 -114.67837 343.6812 229.1798
##  [2,] 194.59447  -55.41706 444.6060 250.0115
##  [3,] 231.00633  -46.65549 508.6681 277.6618
##  [4,] 174.72407 -128.41342 477.8616 303.1375
##  [5,] 146.18593 -178.22388 470.5957 324.4098
##  [6,] 100.65108 -244.37474 445.6769 345.0258
##  [7,] 102.56050 -256.12452 461.2455 358.6850
##  [8,] 104.99433 -263.46527 473.4539 368.4596
##  [9,]  83.20564 -292.53628 458.9476 375.7419
## [10,]  67.29079 -316.18083 450.7624 383.4716
d = glmnet(df_2[,c(1,3,4,5,6)],df_2[,2],family="mgaussian", alpha = 0)
predict(var.5,s=d$lambda)$fcst$payems
##            fcst      lower    upper       CI
##  [1,] 114.50142 -114.67837 343.6812 229.1798
##  [2,] 194.59447  -55.41706 444.6060 250.0115
##  [3,] 231.00633  -46.65549 508.6681 277.6618
##  [4,] 174.72407 -128.41342 477.8616 303.1375
##  [5,] 146.18593 -178.22388 470.5957 324.4098
##  [6,] 100.65108 -244.37474 445.6769 345.0258
##  [7,] 102.56050 -256.12452 461.2455 358.6850
##  [8,] 104.99433 -263.46527 473.4539 368.4596
##  [9,]  83.20564 -292.53628 458.9476 375.7419
## [10,]  67.29079 -316.18083 450.7624 383.4716

Bayesian Methods: Autoregression (6) using Laplace Priors

# unrate

# AR(6) design frame: align unrate with its first six lags on a common
# window (starting 1948:7 so all six lags exist for every row).
unratelags<-data.frame(
  window(unrate,start=c(1948,7),end=c(2019,3)),
  window(stats::lag(unrate,-1),start=c(1948,7),end=c(2019,3)),
  window(stats::lag(unrate,-2),start=c(1948,7),end=c(2019,3)),
  window(stats::lag(unrate,-3),start=c(1948,7),end=c(2019,3)),
  window(stats::lag(unrate,-4),start=c(1948,7),end=c(2019,3)),
  window(stats::lag(unrate,-5),start=c(1948,7),end=c(2019,3)), 
  window(stats::lag(unrate,-6),start=c(1948,7),end=c(2019,3)))
  
colnames(unratelags)<-c("unrate","unratel1","unratel2","unratel3","unratel4","unratel5", "unratel6")

untoday<-length(unratelags$unrate) #Last observation
# One-step-ahead predictor row: everything is shifted forward one month, so
# today's unrate becomes lag 1, today's lag 1 becomes lag 2, etc.
untodayslags<-data.frame(unratelags$unrate[untoday],
    unratelags$unratel1[untoday],unratelags$unratel2[untoday],
    unratelags$unratel3[untoday],unratelags$unratel4[untoday], 
    unratelags$unratel5[untoday],unratelags$unratel6[untoday])
# NOTE(review): seven columns are built above but only six names are
# assigned, so the last column (today's 6th lag) keeps an NA name. Harmless
# here -- the formula only references unratel1..unratel6 -- but the extra
# column could be dropped.
names(untodayslags)<-c("unratel1","unratel2","unratel3","unratel4","unratel5", "unratel6")

options(mc.cores = parallel::detectCores())
# Bayesian AR(6) with Laplace (double-exponential) priors on the lag
# coefficients, fit by MCMC via rstanarm.
AR6_unrate_c = stan_glm(unrate ~ unratel1 + unratel2 + unratel3 + unratel4 + unratel5 + unratel6, data = unratelags, prior = laplace())
summary(AR6_unrate_c, digits = 5)
## 
## Model Info:
## 
##  function:     stan_glm
##  family:       gaussian [identity]
##  formula:      unrate ~ unratel1 + unratel2 + unratel3 + unratel4 + unratel5 + 
##     unratel6
##  algorithm:    sampling
##  priors:       see help('prior_summary')
##  sample:       4000 (posterior sample size)
##  observations: 849
##  predictors:   7
## 
## Estimates:
##                 mean      sd        2.5%      25%       50%       75%    
## (Intercept)     0.10092   0.02431   0.05374   0.08468   0.10065   0.11684
## unratel1        0.98959   0.03352   0.92614   0.96687   0.98900   1.01204
## unratel2        0.21621   0.04690   0.12508   0.18412   0.21638   0.24907
## unratel3       -0.06053   0.04735  -0.15218  -0.09354  -0.06086  -0.02887
## unratel4       -0.06263   0.04699  -0.15321  -0.09532  -0.06324  -0.03038
## unratel5        0.02284   0.04603  -0.06607  -0.00792   0.02203   0.05327
## unratel6       -0.12296   0.03312  -0.18919  -0.14475  -0.12360  -0.10020
## sigma           0.19244   0.00462   0.18381   0.18917   0.19233   0.19549
## mean_PPD        5.77068   0.00943   5.75230   5.76444   5.77046   5.77701
## log-posterior 169.88249   4.01246 161.01179 167.36055 170.35050 172.79960
##                 97.5%  
## (Intercept)     0.15024
## unratel1        1.05546
## unratel2        0.30608
## unratel3        0.03303
## unratel4        0.03033
## unratel5        0.11476
## unratel6       -0.05885
## sigma           0.20181
## mean_PPD        5.78938
## log-posterior 176.36728
## 
## Diagnostics:
##               mcse    Rhat    n_eff
## (Intercept)   0.00038 0.99957 4079 
## unratel1      0.00054 0.99946 3891 
## unratel2      0.00077 0.99960 3754 
## unratel3      0.00079 0.99938 3553 
## unratel4      0.00085 0.99995 3092 
## unratel5      0.00106 1.00051 1871 
## unratel6      0.00058 1.00022 3215 
## sigma         0.00008 1.00318 3138 
## mean_PPD      0.00016 1.00149 3357 
## log-posterior 0.16104 1.00470  621 
## 
## For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
# Posterior predictive draws for next month's unemployment rate.
predict_AR6_unrate_c = posterior_predict(AR6_unrate_c, newdata = untodayslags)
summary(predict_AR6_unrate_c)
##        1        
##  Min.   :3.131  
##  1st Qu.:3.684  
##  Median :3.814  
##  Mean   :3.813  
##  3rd Qu.:3.942  
##  Max.   :4.527
quantile(predict_AR6_unrate_c, c(0.025, 0.975))
##     2.5%    97.5% 
## 3.438197 4.192865
# Evaluation 
# NOTE(review): this repeats the identical summary printed above; the second
# call adds no new information.
summary(AR6_unrate_c, digits = 5)
## 
## Model Info:
## 
##  function:     stan_glm
##  family:       gaussian [identity]
##  formula:      unrate ~ unratel1 + unratel2 + unratel3 + unratel4 + unratel5 + 
##     unratel6
##  algorithm:    sampling
##  priors:       see help('prior_summary')
##  sample:       4000 (posterior sample size)
##  observations: 849
##  predictors:   7
## 
## Estimates:
##                 mean      sd        2.5%      25%       50%       75%    
## (Intercept)     0.10092   0.02431   0.05374   0.08468   0.10065   0.11684
## unratel1        0.98959   0.03352   0.92614   0.96687   0.98900   1.01204
## unratel2        0.21621   0.04690   0.12508   0.18412   0.21638   0.24907
## unratel3       -0.06053   0.04735  -0.15218  -0.09354  -0.06086  -0.02887
## unratel4       -0.06263   0.04699  -0.15321  -0.09532  -0.06324  -0.03038
## unratel5        0.02284   0.04603  -0.06607  -0.00792   0.02203   0.05327
## unratel6       -0.12296   0.03312  -0.18919  -0.14475  -0.12360  -0.10020
## sigma           0.19244   0.00462   0.18381   0.18917   0.19233   0.19549
## mean_PPD        5.77068   0.00943   5.75230   5.76444   5.77046   5.77701
## log-posterior 169.88249   4.01246 161.01179 167.36055 170.35050 172.79960
##                 97.5%  
## (Intercept)     0.15024
## unratel1        1.05546
## unratel2        0.30608
## unratel3        0.03303
## unratel4        0.03033
## unratel5        0.11476
## unratel6       -0.05885
## sigma           0.20181
## mean_PPD        5.78938
## log-posterior 176.36728
## 
## Diagnostics:
##               mcse    Rhat    n_eff
## (Intercept)   0.00038 0.99957 4079 
## unratel1      0.00054 0.99946 3891 
## unratel2      0.00077 0.99960 3754 
## unratel3      0.00079 0.99938 3553 
## unratel4      0.00085 0.99995 3092 
## unratel5      0.00106 1.00051 1871 
## unratel6      0.00058 1.00022 3215 
## sigma         0.00008 1.00318 3138 
## mean_PPD      0.00016 1.00149 3357 
## log-posterior 0.16104 1.00470  621 
## 
## For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
prior_summary(AR6_unrate_c)
## Priors for model 'AR6_unrate_c' 
## ------
## Intercept (after predictors centered)
##  ~ normal(location = 0, scale = 10)
##      **adjusted scale = 16.36
## 
## Coefficients
##  ~ laplace(location = [0,0,0,...], scale = [2.5,2.5,2.5,...])
##      **adjusted scale = [2.50,2.50,2.50,...]
## 
## Auxiliary (sigma)
##  ~ exponential(rate = 1)
##      **adjusted scale = 1.64 (adjusted rate = 1/adjusted scale)
## ------
## See help('prior_summary.stanreg') for more details
hist(predict_AR6_unrate_c)

# NOTE(review): the plotted point (3.889) does not match the posterior
# predictive mean reported above (3.813) -- confirm which sampling run the
# hard-coded value came from.
autoplot(subset(unrate, start = 800), color = "blue") + ggtitle("Forecast for Unemployment Rate") + labs(x = "Date", y = "Unemployment Rate (%)") + geom_point(aes(x = 2019.33, y = 3.889), color = "red") 

# payems 

# Same AR(6) construction as for unrate, applied to the payems series
# (window starts 1939:7 so all six lags exist for every row).
payemslags<-data.frame(
  window(payems,start=c(1939,7),end=c(2019,3)),
  window(stats::lag(payems,-1),start=c(1939,7),end=c(2019,3)),
  window(stats::lag(payems,-2),start=c(1939,7),end=c(2019,3)),
  window(stats::lag(payems,-3),start=c(1939,7),end=c(2019,3)),
  window(stats::lag(payems,-4),start=c(1939,7),end=c(2019,3)),
  window(stats::lag(payems,-5),start=c(1939,7),end=c(2019,3)), 
  window(stats::lag(payems,-6),start=c(1939,7),end=c(2019,3)))
  
colnames(payemslags)<-c("payems","payemsl1","payemsl2","payemsl3","payemsl4","payemsl5", "payemsl6")

ptoday<-length(payemslags$payems) #Last observation
# One-step-ahead predictor row: shift everything forward one month so
# today's payems serves as lag 1.
ptodayslags<-data.frame(payemslags$payems[ptoday],payemslags$payemsl1[ptoday],
      payemslags$payemsl2[ptoday],payemslags$payemsl3[ptoday],payemslags$payemsl4[ptoday], payemslags$payemsl5[ptoday], payemslags$payemsl6[ptoday])
# NOTE(review): as in the unrate chunk, seven columns get six names; the
# NA-named seventh column is unused by the formula below.
names(ptodayslags)<-c("payemsl1","payemsl2","payemsl3","payemsl4","payemsl5", "payemsl6")

options(mc.cores = parallel::detectCores())
# Bayesian AR(6) for payems with Laplace priors on the lag coefficients.
AR6_payems_c = stan_glm(payems ~ payemsl1 + payemsl2 + payemsl3 + payemsl4 + payemsl5 + payemsl6, data = payemslags, prior = laplace())
## Warning: There were 2 divergent transitions after warmup. Increasing adapt_delta above 0.95 may help. See
## http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
## Warning: Examine the pairs() plot to diagnose sampling problems
summary(AR6_payems_c, digits = 5)
## 
## Model Info:
## 
##  function:     stan_glm
##  family:       gaussian [identity]
##  formula:      payems ~ payemsl1 + payemsl2 + payemsl3 + payemsl4 + payemsl5 + 
##     payemsl6
##  algorithm:    sampling
##  priors:       see help('prior_summary')
##  sample:       4000 (posterior sample size)
##  observations: 956
##  predictors:   7
## 
## Estimates:
##                 mean        sd          2.5%        25%         50%      
## (Intercept)      28.74168     7.00689    14.89944    24.00002    28.76728
## payemsl1          0.25949     0.03204     0.19540     0.23805     0.25952
## payemsl2          0.26353     0.03274     0.19886     0.24152     0.26331
## payemsl3          0.14192     0.03404     0.07374     0.11896     0.14200
## payemsl4          0.08597     0.03394     0.01815     0.06299     0.08554
## payemsl5          0.10459     0.03235     0.04077     0.08274     0.10425
## payemsl6         -0.08256     0.03147    -0.14227    -0.10408    -0.08260
## sigma           174.42756     4.05059   166.74347   171.64175   174.45065
## mean_PPD        125.86869     8.12384   109.62648   120.35179   125.93586
## log-posterior -6319.22224     3.84123 -6327.82950 -6321.54233 -6318.78322
##                 75%         97.5%    
## (Intercept)      33.47942    42.35270
## payemsl1          0.28166     0.32055
## payemsl2          0.28576     0.32836
## payemsl3          0.16502     0.20880
## payemsl4          0.10853     0.15459
## payemsl5          0.12707     0.16712
## payemsl6         -0.06139    -0.01968
## sigma           177.14631   182.40338
## mean_PPD        131.40612   141.87530
## log-posterior -6316.44939 -6312.81859
## 
## Diagnostics:
##               mcse    Rhat    n_eff
## (Intercept)   0.12960 0.99941 2923 
## payemsl1      0.00050 1.00059 4174 
## payemsl2      0.00050 1.00010 4310 
## payemsl3      0.00054 0.99983 4043 
## payemsl4      0.00052 0.99958 4194 
## payemsl5      0.00052 0.99995 3854 
## payemsl6      0.00050 0.99947 3970 
## sigma         0.07102 0.99937 3253 
## mean_PPD      0.14206 0.99952 3270 
## log-posterior 0.17967 1.00225  457 
## 
## For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
# Posterior predictive draws for next month's change in payrolls.
predict_AR6_payems_c = posterior_predict(AR6_payems_c, newdata = ptodayslags)
summary(predict_AR6_payems_c)
##        1          
##  Min.   :-486.59  
##  1st Qu.:  27.31  
##  Median : 149.74  
##  Mean   : 147.06  
##  3rd Qu.: 266.10  
##  Max.   : 799.90
quantile(predict_AR6_payems_c, c(0.025, 0.975))
##      2.5%     97.5% 
## -187.9131  475.8982
# NOTE(review): the plotted point (194.10) does not match the posterior
# predictive mean reported above (147.06) -- confirm which sampling run the
# hard-coded value came from.
autoplot(subset(payems, start = 800), color = "blue") + ggtitle("Forecast for Changes in Nonfarm Payroll Employment") + labs(x = "Date", y = "Change in Employment (100's of thousands of persons)") + geom_point(aes(x = 2019.33, y = 194.10), color = "red")

Machine Learning Procedures: Tree, Random Forest, and Boosting

For my ML methods, I chose to create a dataset containing the lagged values of unrate/payems up to the 6th lag value, and also added the variables that I included in my VAR model earlier, which are known to be related to employment.

# Adopted from Professor Childer's code 

# Unrate 
# Supervised-learning design matrix: unrate regressed on its own first six
# lags plus one-month lags of payems and the macro covariates used in the
# VAR section (nasdaq, gas, cpi, ffr). na.omit() drops rows lost to lagging.
# NOTE(review): this overwrites the df_2 used by the penalized-VAR section.
df_2 = data.frame(na.omit(cbind(unrate, stats::lag(unrate, -1), stats::lag(unrate, -2), stats::lag(unrate, -3), stats::lag(unrate, -4), stats::lag(unrate, -5), stats::lag(unrate, -6), stats::lag(payems, -1), stats::lag(nasdaq,-1), stats::lag(gas,-1), stats::lag(cpi,-1), stats::lag(ffr,-1))))
colnames(df_2) = c("unrate", "unratelag1", "unratelag2", "unratelag3", "unratelag4", "unratelag5", "unratelag6", "payems", "nasdaq", "gas", "cpi", "ffr")


set.seed(998) 

# NOTE(review): createDataPartition() draws a random 75/25 split, so test
# rows are interleaved in time with training rows -- this leaks future
# information relative to a true out-of-time evaluation. Confirm intended.
inTraining <- createDataPartition(df_2$unrate, p = .75, list = FALSE) 
training <- df_2[ inTraining,]
testing  <- df_2[-inTraining,]

# Plain rpart fit; not used below (rpfit1 is the tuned tree that is used).
# Kept because rpart's internal cross-validation consumes random numbers, so
# removing it would shift the RNG stream and change every printed result.
rpfit <- rpart(unrate ~ ., data = training)

# 10-fold cross-validation, repeated 10 times, for caret tuning.
fitControl <- trainControl(
                           method = "repeatedcv",
                           number = 10,
                           repeats = 10)

# Decision tree with cp tuned by repeated CV.
rpfit1  <- train(unrate ~ ., data = training, 
                 method = "rpart", 
                 trControl = fitControl,
                 na.action = na.exclude)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
# Random forest with no resampling (caret's defaults for mtry/ntree).
forestControl <- trainControl(method = "none") 
rffit1  <- train(unrate ~ ., data = training, 
                 method = "rf", 
                 trControl = forestControl,
                 na.action = na.exclude)
splitspertree<-rffit1$finalModel$mtry
numbertrees<-rffit1$finalModel$ntree

xgbfitControl <- trainControl(
                            method = "repeatedcv",
                            number = 10,
                            repeats = 10)

# Boosted trees tuned by caret; note the reported boosting forecast below
# actually comes from the hand-parameterized `bst` fit, not from xgbfit1.
xgbfit1  <- train(unrate ~ ., data = training, 
                  method = "xgbTree", 
                  trControl = xgbfitControl,
                  na.action = na.exclude)

# "reg:linear" is a deprecated alias of "reg:squarederror" in recent xgboost.
cvparam <- list(max_depth = 1, eta = 0.3, gamma=0, colsample_bytree=0.8,
                min_child_weight=1, subsample=1, objective = "reg:linear", eval_metric = "rmse")

trainfeat<-select(training,-one_of("unrate"))
testfeat<-select(testing,-one_of("unrate"))

dtrain <- xgb.DMatrix(as.matrix(trainfeat), label=training$unrate)
dtest <- xgb.DMatrix(as.matrix(testfeat), label = testing$unrate)

watchlist <- list(train = dtrain, eval = dtest)

bst <- xgb.train(params=cvparam, data=dtrain,verbose=0,nrounds=50,watchlist=watchlist)

# Forecasts 
rppreds<-predict(rpfit1$finalModel, newdata = testing) #Decision Tree
rfpreds<-predict(rffit1$finalModel, newdata = na.roughfix(testing)) #Random Forest
bstpreds<-predict(bst,newdata=as.matrix(testfeat)) #Tree Boosting

# Evaluation 
# Held-out squared errors per method; RMSE = sqrt of the column means.
prederrors<-data.frame((rppreds-testing$unrate)^2,(rfpreds-testing$unrate)^2,
                       (bstpreds-testing$unrate)^2)
MSEvec<-colMeans(prederrors,na.rm=TRUE) 
TestRMSE<-sqrt(MSEvec) 

# NOTE(review): RMSE[1] is the first cp row of the tuning grid, not
# necessarily the selected cp (rpfit1$bestTune) -- confirm intended.
rprmse<-rpfit1$results$RMSE[1] 
# OOB MSE at the final tree; indexed by the fitted forest's actual tree
# count rather than a hard-coded 500 (same value under caret's default).
rfrmse<-sqrt(rffit1$finalModel$mse[numbertrees]) 
# Training RMSE at the last boosting round; indexed by the log length
# rather than a hard-coded 50 (same value since nrounds = 50).
bstrmse<-bst$evaluation_log$train_rmse[nrow(bst$evaluation_log)] 

TrainRMSE<-c(rprmse,rfrmse,bstrmse)
resmat<-as.matrix(data.frame(TestRMSE,TrainRMSE))
fitresults<-data.frame(t(resmat))
colnames(fitresults)<-c("Tree","Random Forest","Boosting")
fitresults
##                Tree Random Forest  Boosting
## TestRMSE  0.5035231     0.2740606 0.2070531
## TrainRMSE 0.5533178     0.2469407 0.1537530
# Point forecasts: tree (red), random forest (magenta), boosting (green).
autoplot(subset(unrate, start = 800), color = "blue") + ggtitle("Forecast for Unemployment Rate") + labs(x = "Date", y = "Unemployment Rate (%)") + geom_point(aes(x = 2019.33, y = 4.528169), color = "red") + geom_point(aes(x = 2019.33, y = 3.864400), color = "magenta") + geom_point(aes(x = 2019.33, y = 3.854172), color = "darkgreen") 

# Adopted from Professor Childer's code 

# Payems 

# Design matrix mirroring the unrate ML chunk: payems on its own first six
# lags plus one-month lags of unrate and the macro covariates.
df_2 = data.frame(na.omit(cbind(payems, stats::lag(payems, -1), stats::lag(payems, -2), stats::lag(payems, -3), stats::lag(payems, -4), stats::lag(payems, -5), stats::lag(payems, -6), stats::lag(unrate, -1), stats::lag(nasdaq,-1), stats::lag(gas,-1), stats::lag(cpi,-1), stats::lag(ffr,-1))))
colnames(df_2) = c("payems", "payemslag1", "payemslag2", "payemslag3", "payemslag4", "payemslag5", "payemslag6", "unrate", "nasdaq", "gas", "cpi", "ffr")


set.seed(998) 

# NOTE(review): random split of a time series -- same caveat as the unrate
# chunk (future rows can land in the training set).
inTraining <- createDataPartition(df_2$payems, p = .75, list = FALSE)
training <- df_2[ inTraining,]
testing  <- df_2[-inTraining,]

# Plain rpart fit; unused below but kept so the RNG stream (and therefore
# every printed result) is unchanged -- rpart's internal cross-validation
# consumes random numbers.
rpfit <- rpart(payems ~ ., data = training)

fitControl <- trainControl(
                           method = "repeatedcv",
                           number = 10,
                           repeats = 10)

# Decision tree with cp tuned by repeated CV.
rpfit1  <- train(payems ~ ., data = training, 
                 method = "rpart", 
                 trControl = fitControl,
                 na.action = na.exclude)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
# Random forest; reuses forestControl (method = "none") defined in the
# unrate chunk, so no resampling is done here either.
rffit1  <- train(payems ~ ., data = training, 
                 method = "rf", 
                 trControl = forestControl,
                 na.action = na.exclude)
splitspertree<-rffit1$finalModel$mtry
numbertrees<-rffit1$finalModel$ntree

xgbfitControl <- trainControl(
                            method = "repeatedcv",
                            number = 10,
                            repeats = 10)

# Tuned by caret but, as in the unrate chunk, the reported boosting
# forecast comes from the hand-parameterized `bst` fit below.
xgbfit1  <- train(payems ~ ., data = training, 
                  method = "xgbTree", 
                  trControl = xgbfitControl,
                  na.action = na.exclude)

# "reg:linear" is a deprecated alias of "reg:squarederror" in recent xgboost.
cvparam <- list(max_depth = 1, eta = 0.3, gamma=0, colsample_bytree=0.8,
                min_child_weight=1, subsample=1, objective = "reg:linear", eval_metric = "rmse")

trainfeat<-select(training,-one_of("payems"))
testfeat<-select(testing,-one_of("payems"))

dtrain <- xgb.DMatrix(as.matrix(trainfeat), label=training$payems)
dtest <- xgb.DMatrix(as.matrix(testfeat), label = testing$payems)

watchlist <- list(train = dtrain, eval = dtest)

bst <- xgb.train(params=cvparam, data=dtrain,verbose=0,nrounds=50,watchlist=watchlist)

# Forecasts 
rppreds<-predict(rpfit1$finalModel, newdata = testing) #Decision Tree
rfpreds<-predict(rffit1$finalModel, newdata = na.roughfix(testing)) #Random Forest
bstpreds<-predict(bst,newdata=as.matrix(testfeat)) #Tree Boosting

# Evaluation 
prederrors<-data.frame((rppreds-testing$payems)^2,(rfpreds-testing$payems)^2,
                       (bstpreds-testing$payems)^2)
MSEvec<-colMeans(prederrors,na.rm=TRUE)
TestRMSE<-sqrt(MSEvec) 

# NOTE(review): RMSE[1] is the first cp row of the tuning grid, not
# necessarily the selected cp (rpfit1$bestTune) -- confirm intended.
rprmse<-rpfit1$results$RMSE[1] 
# OOB MSE at the final tree; indexed by the fitted forest's actual tree
# count rather than a hard-coded 500 (same value under caret's default).
rfrmse<-sqrt(rffit1$finalModel$mse[numbertrees]) 
# Training RMSE at the last boosting round; indexed by the log length
# rather than a hard-coded 50 (same value since nrounds = 50).
bstrmse<-bst$evaluation_log$train_rmse[nrow(bst$evaluation_log)] 

TrainRMSE<-c(rprmse,rfrmse,bstrmse)
resmat<-as.matrix(data.frame(TestRMSE,TrainRMSE))
fitresults<-data.frame(t(resmat))
colnames(fitresults)<-c("Tree","Random Forest","Boosting")
fitresults
##               Tree Random Forest  Boosting
## TestRMSE  150.6497      121.0619 137.79966
## TrainRMSE 158.8773      119.6887  82.91631
# Point forecasts: tree (red), random forest (magenta), boosting (green).
autoplot(subset(payems, start = 800), color = "blue") + ggtitle("Forecast for Changes in Nonfarm Payroll Employment") + labs(x = "Date", y = "Change in Employment (100's of thousands of persons)") + geom_point(aes(x = 2019.33, y = 146.7534), color = "red") + geom_point(aes(x = 2019.33, y = 213.272733), color = "magenta") + geom_point(aes(x = 2019.33, y = 246.33800), color = "darkgreen") 

For both series, the tree method seems to perform substantially worse than random forest and boosting, so I will remove the tree forecasts from my final average.

Conclusion

For my first ever forecast (for January), I relied heavily on expert judgement and ended up doing quite poorly, so for my second forecast (for February) I decided to equally weight model averages. I definitely improved my estimates, but I was still not as accurate as I felt I could be, and I realized a potential reason for this was that I was not selective enough in my model selection - I felt that some models were more similar than others in terms of diagnostics/loss measures, and so I decided to keep the majority of them in my final averaging, when perhaps being a little more selective could've given me better estimates. So, for my third forecast (for March), I also used an equal-weighted average but was much more selective this time, and I ended up performing quite well. Thus, I decided to carry out the same method for my final forecast, making sure to be equally if not more selective, and I also made an effort to gauge the opinion of experts when formulating my final forecast.

After evaluating the baseline methods and removing forecasts that performed substantially worse in regards to diagnostics and RMSE, I decided to average the remaining forecasts to create my final forecast. The reasons that I decided to go with an equal-weight model average are the following: this method minimizes risk, reduces the chance of overfitting as it is not dependent on the data, improves square-loss risk, and, if all methods are performing somewhat similarly - which I feel has been the case for most forecasts of the two series we have been dealing with - it gives all methods the chance to contribute to the forecast.

For unrate, I removed the AR(1), AR(2), and tree forecasts as they underperformed compared to other methods - this resulted in a forecast of 3.79%.

For payems, I also removed the AR(1), AR(2), and tree forecasts as they underperformed compared to other methods. Because nonfarm payrolls growth has been quite volatile recently, I decided to research what experts in the industry expect April's numbers to look like and found that estimates were in the 170k-180k range, which is substantially higher than some of my estimates. I thus decided to remove two additional forecasts: VAR(2) and VAR(5), which estimated the growth to be 101k and 114k respectively. Removing the necessary estimates resulted in a forecast of 170k.